
# Merging hourly unit generation data with the excess load data to be used to 
# fit tobit model 
library(pacman)
p_load(
  here, fst, data.table, magrittr, janitor, readxl, lutz,
  dplyr, lubridate, stringr, purrr
)

# Loading data ----------------------------------------------------------------
# Unit information table 
# Filtering to just units open all year
unit_info_dt = read.fst(
  here("Data/electricity-generation/unit-info-dt.fst"),
  as.data.table = TRUE
)[open_status == 'fully operational']


# Hourly emissions data 
cems_dt = 
  map_dfr(
    2018:2020,
    \(yr){
    cems_dt = read.fst(
        here(paste0("Data/electricity-generation/cems/raw_cems_",yr,".fst")),
        as.data.table = TRUE
      )[,year := yr][
        year == 2019 
        | year == 2018 & month(mdy(op_date)) == 12
        | year == 2020 & month(mdy(op_date)) == 1
      ]
    }
  )|>
  merge( 
    unit_info_dt[,.(orispl_code, unitid, tz)],
    by = c("orispl_code",'unitid'),#,"year"),
  ) 


# Converting local to UTC -----------------------------------------------------
# Calculating UTC for each hour in each timezone
# TODO: DST messing things up, since 03-10-2019 2am technically doesn't exist
local_to_utc_dt = 
  cems_dt[!is.na(tz),.(op_date, op_hour, tz)] |> 
  unique() |>
  rowwise() |>
  mutate(
    local_time = mdy_h(paste0(op_date," ",op_hour), tz = tz),
    utc_time = with_tz(local_time, "UTC")
  ) |>
  data.table()

# Merging with raw generation data
elec_gen_dt = 
  merge(
    cems_dt,
    local_to_utc_dt, 
    by = c("op_date","op_hour","tz")
  ) 

# Filtering to just 2019
elec_gen_dt = elec_gen_dt[year(utc_time) == 2019]
elec_gen_dt |> setkey(utc_time, orispl_code, unitid)

# Making sure every unit has a row for every hour 
all_unit_hour_dt = 
  merge(
    data.table(
      orispl_code = rep(unit_info_dt$orispl_code, each = 365*24),
      unitid = rep(unit_info_dt$unitid, each = 365*24),
      utc_time = seq(
        ymd_hms('2019-01-01 00:00:00'),
        ymd_hms('2019-12-31 23:00:00'), 
        by = 'hour'
      )),
    unit_info_dt[,-'year'], 
    by = c('orispl_code','unitid')
  ) |>
  setkey(utc_time, orispl_code, unitid)

rm(cems_dt, local_to_utc_dt)
invisible(gc())

elec_gen_dt = 
  merge(
    all_unit_hour_dt, 
    elec_gen_dt[,-'tz'], 
    by = c('utc_time','orispl_code', 'unitid'),
    all.x = TRUE
  )

# Setting missing values to 0 
elec_gen_dt[,':='(
  gload_mw = ifelse(is.na(gload_mw), 0, gload_mw),
  so2_mass_lbs = ifelse(is.na(so2_mass_lbs), 0, so2_mass_lbs),
  nox_mass_lbs = ifelse(is.na(nox_mass_lbs), 0, nox_mass_lbs),
  co2_mass_tons = ifelse(is.na(co2_mass_tons), 0, co2_mass_tons),
  heat_input_mm_btu = ifelse(is.na(heat_input_mm_btu), 0, heat_input_mm_btu),
  sload_1000lb_hr = ifelse(is.na(sload_1000lb_hr), 0, sload_1000lb_hr),
  op_time = ifelse(is.na(op_time), 0, op_time)
)]

rm(all_unit_hour_dt)
invisible(gc())

# Adding BA and region load data to generation  -------------------------------
# Excess load by balancing authority 
ba_load_dt = 
  read.fst(
    here("Data/electricity-generation/clean-load/ba-load-dt.fst"),
    as.data.table = TRUE
  )
nerc_load_dt = 
  read.fst(
    path = here("Data/electricity-generation/clean-load/nerc-load-dt.fst"),
    as.data.table = TRUE
  )
# Merging generation to BA load 
elec_gen_dt =   
  merge(
    elec_gen_dt,
    ba_load_dt,
    by = c("ba_code","utc_time"),
    all.x = TRUE
  )

# Casting region load into wide form
nerc_eload_dt = dcast(
  nerc_load_dt,
  formula = utc_time ~ nerc_adj,
  value.var = "excess_load"
)

setnames(
  nerc_eload_dt,
  old = colnames(nerc_eload_dt)[-1],
  new = paste0("excess_load_", tolower(colnames(nerc_eload_dt)[-1]))
)

# Merging with generation data
elec_gen_dt = 
  merge(
    elec_gen_dt,
    nerc_eload_dt,
    by = "utc_time",
    all.x = TRUE
  )

rm(ba_load_dt, nerc_eload_dt, nerc_load_dt)
invisible(gc())

# Adding dummy variables for each nerc region 
elec_gen_dt[,nerc_cal := nerc_adj == "CAL"
][,nerc_mro := nerc_adj == "MRO"
][,nerc_npcc := nerc_adj == "NPCC"
][,nerc_rfc := nerc_adj == "RFC"
][,nerc_serc := nerc_adj == "SERC"
][,nerc_tre := nerc_adj == "TRE"
][,nerc_wecc := nerc_adj == "WECC"
]

# Adding dummy variables for each interconnection
elec_gen_dt[,ic_cal := nerc_adj %in% c("CAL","WECC")
][,ic_mro := nerc_adj %in% c("MRO","NPCC","RFC","SERC")
][,ic_npcc := nerc_adj %in% c("MRO","NPCC","RFC","SERC")
][,ic_rfc := nerc_adj %in% c("MRO","NPCC","RFC","SERC")
][,ic_serc := nerc_adj %in% c("MRO","NPCC","RFC","SERC")
][,ic_tre := nerc_adj == "TRE"
][,ic_wecc := nerc_adj %in% c("CAL","WECC")
]

# creating squared terms 
elec_gen_dt[,':='(
  excess_load_cal_sq = excess_load_cal^2,
  excess_load_mro_sq = excess_load_mro^2,
  excess_load_npcc_sq = excess_load_npcc^2,
  excess_load_rfc_sq = excess_load_rfc^2,
  excess_load_serc_sq = excess_load_serc^2,
  excess_load_tre_sq = excess_load_tre^2,
  excess_load_wecc_sq = excess_load_wecc^2
)]

# Calculating MWH by multiplying by operating time 
elec_gen_dt[,gload_mwh := gload_mw*op_time]

rm(unit_info_dt)
invisible(gc())

# Saving the results  ---------------------------------------------------------
write.fst(
  x = elec_gen_dt,
  path = here("Data/electricity-generation/elec-gen-dt.fst"),
  compress = 100
)


 elec_gen_dt =
  read.fst(
    path = here("Data/electricity-generation/elec-gen-dt.fst"),
    as.data.table = TRUE
  )


# Checking distribution of gload for 9 random units
p_load(ggplot2)
oplc = 
  elec_gen_dt[
    !is.na(gload_mw),.(
      N = .N,
      namepcap_95 = quantile(gload_mw, 0.95),
      namepcap = median(as.numeric(namepcap), na.rm = TRUE)
    ),
    by = .(orispl_code, unitid)
  ][N >1000][sample(1:2191, 9)] 

ex_units_distr_under_namepcap_p = 
  ggplot(
    data = merge(elec_gen_dt, oplc[,.(orispl_code, unitid)], by = c("orispl_code","unitid")), 
    aes(x = gload_mw)
  ) + 
  geom_histogram(bins = 75) +
  geom_vline(
    aes(xintercept = namepcap), 
    linetype = "dashed", color = "red"
  ) +
  labs(
    x = "Hourly Electricity Production (MW)", y = "Count"
  ) +
  facet_wrap(~orispl_code+unitid, scales = "free") + 
  theme_minimal()
ggsave(
  plot = ex_units_distr_under_namepcap_p,
  filename = here("figures/electricity-generation/hist-ex-units-under-namepcap.pdf"),
  device = cairo_pdf,
  width = 8, height = 6
)


# What is distribution of gload relative to namepate capacities
distr_under_namepcap_p =
  elec_gen_dt[,.(gload_under_cap = gload_mw-namepcap)] |>
  ggplot(aes(x = gload_under_cap)) + 
  geom_histogram(bins = 300) + 
  labs(
    x = "Production under Nameplace Capacity",
    y = "Count"
  ) +
  theme_minimal()
ggsave(
  plot = distr_under_namepcap_p,
  filename = here("figures/electricity-generation/hist-under-namepcap.pdf"),
  device = cairo_pdf,
  width = 8, height = 6
)






